The goal of this project is to find the most accurate model using a variety of statistical model building techniques in order to predict medical outcomes relating to breast cancer. We will use the BRCA Multi-Omics (TCGA) data from Kaggle, and predict outcomes for the PR.Status, ER.Status, HER2.Final.Status, and histological.type. For our approach, we will first perform the necessary data cleaning operations, then [method 1] and [method2] to build classification models for PR.Status and [method1] and [method2] for histological.type, using classification error and AUC respectively as the evalution criteria. Next, we will use [method] to build a model with the goal of accurately predicting all four outcomes. Finally, [INSERT CONCLUSION HERE].
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
brca_data = read.csv("https://raw.githubusercontent.com/mgarbvs/STAT-432-final-project/main/cleaned_brca_data.csv")
# remove X.1 and X columns
brca_data = brca_data %>% select(-c("X.1", "X"))
head(brca_data, 10)
Structure of data:
#str(brca_data)
PR.Status, ER.Status, and HER2.Final.Status are determined using immunohistochemistry scoring. For these variables, we will only consider two levels: “Positive” and “Negative”. For histological.type, we will only consider “infiltrating lobular carcinoma” and “infiltrating ductal carcinoma”. You can treat all other categories as missing values. Hence, all four outcomes should be binary.
which(colnames(brca_data)=="PR.Status")
## [1] 1937
which(colnames(brca_data)=="ER.Status")
## [1] 1938
which(colnames(brca_data)=="HER2.Final.Status")
## [1] 1939
which(colnames(brca_data)=="histological.type")
## [1] 1940
Check for null values:
colSums(is.na(brca_data[, 1937:1940]))
## PR.Status ER.Status HER2.Final.Status histological.type
## 0 0 0 0
– check for closely correlated variables and leave only one
suppressMessages(library(caret))
suppressMessages(library(dplyr))
# store the predictors
predictors = brca_data %>% select(c("PR.Status", "ER.Status", "HER2.Final.Status", "histological.type"))
# remove the categorical var temporarily
brca_data = brca_data %>% select(-c("PR.Status", "ER.Status", "HER2.Final.Status", "histological.type"))
# use correlation matrix
#cor_mat = cor(brca_data)
# returns vector of indices to remove
#cor_list = findCorrelation(cor_mat, cutoff = .8)
Drop the highly correlated columns
#brca_data = brca_data[, -c(cor_list)]
#brca_data
We still have 1087 col left…
# categorical variables
brca_data[605:1464]
brca_data[1465:1713]
# only use variances using the corr mat
non_categorical_brca = cbind(brca_data[1:604], brca_data[1714:1936])
# use correlation matrix
cor_mat = cor(non_categorical_brca)
# returns vector of indices to remove
cor_list = findCorrelation(cor_mat, cutoff = .7)
non_categorical_brca = non_categorical_brca[, -c(cor_list)]
non_categorical_brca
Split the data:
## 75% of the sample size
smp_size <- floor(0.75 * nrow(non_categorical_brca))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(non_categorical_brca)), size = smp_size)
brca_train <- as.matrix(non_categorical_brca[train_ind, ])
brca_test <- as.matrix(non_categorical_brca[-train_ind, ])
train_y = as.matrix(predictors$PR.Status[train_ind])
brca_train_1 = as.matrix(cbind(brca_train, train_y))
{r} # library(kknn) # # control <- trainControl(method = "cv", number = 10) # knn.cvfit <- train(y ~ ., method = "knn", # data = data.frame("x" = brca_train_1, "y" = as.factor(predictors$PR.Status[train_ind])), # tuneGrid = data.frame(k = seq(1, 40, 1)), # trControl = control) #{r} # plot(knn.cvfit$results$k, 1-knn.cvfit$results$Accuracy, # xlab = "K", ylab = "Classification Error", type = "b", # pch = 19, col = "darkorange") #{r} # which.min(1-knn.cvfit$results$Accuracy) #The best k is 10
library(caret)
library(class)
train_y = as.matrix(predictors$PR.Status[train_ind])
# brca_train = cbind(brca_train, train_y)
y = as.matrix(predictors$PR.Status[train_ind])
pred = knn.fit = knn(train = brca_train, test = brca_test, cl = y, k = 10)
cmat = table(pred, as.factor(predictors$PR.Status[-train_ind]))
cmat
##
## pred Negative Positive
## Negative 23 3
## Positive 20 81
Accuracy:
(81 + 23)/sum(cmat)
## [1] 0.8188976
suppressMessages(library(glmnet))
set.seed(1)
lasso.fit = cv.glmnet(brca_train, train_y, nfolds = 10, alpha = 1, type.measure = "class", family = "binomial")
par(mfrow = c(1, 2))
plot(lasso.fit)
plot(lasso.fit$glmnet.fit, "lambda")
#
{r} # coef(lasso.fit, s = "lambda.min") #
{r} # coef(lasso.fit, s = "lambda.1se") #Predict on the test data:
best_lambda = lasso.fit$lambda.min
test_predict = predict(lasso.fit, brca_test, s = best_lambda, type = "class")
# test_predict
# length(test_predict)
# length(predictors$PR.Status[-train_ind])
cmat_1 = table(predictors$histological.type[-train_ind], test_predict)
cmat_1
## test_predict
## Negative Positive
## infiltrating ductal carcinoma 29 85
## infiltrating lobular carcinoma 1 12
Accuracy:
(25 + 79)/sum(cmat_1)
## [1] 0.8188976
The classification error is 0.1811024
histological.type with Logistic (with Ridge and Lasso penalty)Next, let us model histological.type, using AUC as the evaluation criterion.
First, use Penalized Logistic Regression:
library(glmnet)
require(methods)
train_y_hist= as.matrix(predictors$histological.type[train_ind])
brca_train_2 = as.matrix(cbind(brca_train, train_y_hist))
# need to force matrix as dgCMatrix for some reason??: https://stackoverflow.com/questions/8458233/r-glmnet-as-matrix-error-message
# using the train data and 10-fold CV
logistic.fit = cv.glmnet(x = as(data.matrix(brca_train_2[, -c(640)]), "dgCMatrix"), y = brca_train_2[, 640], nfold = 10, family = "binomial")
plot(logistic.fit)
Next, use this model to predict on the test data:
library(ROCR)
test_predict <- predict(logistic.fit, brca_test, type="response")
roc <- prediction(test_predict, predictors$histological.type[-train_ind])
# calculates the ROC curve
perf <- performance(roc,"tpr","fpr")
plot(perf,colorize=TRUE)
The AUC:
performance(roc, measure = "auc")@y.values[[1]]
## [1] 0.9473684
Next, try with Ridge:
library(glmnet)
set.seed(432)
ridge.fit = cv.glmnet(as(data.matrix(brca_train_2[, -c(640)]), "dgCMatrix"), y = brca_train_2[, 640], nfolds = 10, alpha = 0, family = "binomial")
plot(ridge.fit$glmnet.fit, "lambda")
Next, use this model to predict on the test data:
library(ROCR)
test_predict <- predict(ridge.fit, brca_test, type="response")
roc <- prediction(test_predict, predictors$histological.type[-train_ind])
# calculates the ROC curve
perf <- performance(roc,"tpr","fpr")
plot(perf,colorize=TRUE)
The AUC:
performance(roc, measure = "auc")@y.values[[1]]
## [1] 0.8407557
The AUC for logistic regression with ridge penalty is lower than the lasso penalty.
histological.type with Kernellibrary(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(432)
rf.fit = randomForest(data.matrix(brca_train_2[, -c(640)]), y = as.factor(brca_train_2[, 640]), ntree = 4, mtry = 4, nodesize = 10, sampsize = 50)
Next, use this model to predict on the test data:
library(ROCR)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
test.predictions <- predict(rf.fit, type="prob", newdata=brca_test)[,2]
correct_hist = as.factor(predictors$histological.type[-train_ind])
rf_pr_test <- prediction(test.predictions, correct_hist)
r_auc_test <- performance(rf_pr_test, "tpr","fpr")
plot(r_auc_test, colorize=TRUE)
The AUC:
performance(rf_pr_test, measure = "auc")@y.values[[1]]
## [1] 0.7672065
this is bad